"""
Phase 4: Regulatory/Institutional Channel Analysis
====================================================
Probes the ~70% of the demographic diversification effect NOT explained
by pension system size. Tests whether exchange rate regime, capital controls
depth, and financial market development are the dominant channels.

Author: Brian Peters
"""

import sys
sys.path.insert(0, '/mnt/c/demographics_capital_flows/multilateral/src')
from model import PanelGLS
import pandas as pd
import numpy as np
from pathlib import Path
import urllib.request
import json
import time

# ── Paths ──────────────────────────────────────────────────────────────
BASE = Path('/mnt/c/demographics_capital_flows/pension_home_bias')
DATA_RAW = BASE / 'data' / 'raw'
DATA_PROC = BASE / 'data' / 'processed'
TABLE_DIR = BASE / 'output' / 'tables'
DATA_RAW.mkdir(parents=True, exist_ok=True)
TABLE_DIR.mkdir(parents=True, exist_ok=True)

# ── Helper: significance stars ─────────────────────────────────────────
def stars(p):
    if p < 0.01: return '***'
    if p < 0.05: return '**'
    if p < 0.10: return '*'
    return ''

def fmt_coef(b, se, p):
    """Format coefficient with stars and SE in parentheses."""
    return f"{b:.4f}{stars(p)} ({se:.4f})"

def run_gls(df, y_var, x_vars, label=''):
    """Run PanelGLS and return (model, sample_df, results_dict)."""
    cols = [y_var] + x_vars + ['iso3', 'year']
    sdf = df.dropna(subset=[y_var] + x_vars).copy()
    if len(sdf) < 50:
        print(f"  [{label}] Skipped — only {len(sdf)} obs")
        return None, sdf
    gls = PanelGLS()
    gls.fit(sdf[y_var].values, sdf[x_vars].values,
            sdf['iso3'].values, sdf['year'].values)
    gls.feature_names = x_vars
    if label:
        print(f"  [{label}] N={gls.n_obs}, C={gls.n_countries}, "
              f"R²={gls.r_squared:.4f}, rho={gls.rho:.4f}")
    return gls, sdf

def coef_dict(gls, x_vars):
    """Return dict of var -> (beta, se, p)."""
    d = {}
    for i, v in enumerate(x_vars):
        d[v] = (gls.beta[i], gls.se[i], gls.pvalues[i])
    return d

# ══════════════════════════════════════════════════════════════════════
# LOAD DATA
# ══════════════════════════════════════════════════════════════════════
print("=" * 70)
print("PHASE 4: REGULATORY / INSTITUTIONAL CHANNEL ANALYSIS")
print("=" * 70)

df = pd.read_csv(DATA_PROC / 'pension_panel.csv')
df = df[df['year'] <= 2024].copy()
print(f"Loaded pension_panel: {df.shape[0]} obs, {df['iso3'].nunique()} countries, "
      f"{df['year'].min()}-{df['year'].max()}")

# Standard controls used throughout
CONTROLS = ['rgdp_growth', 'fiscal_bal_gdp', 'nfa_gdp_lag']
DEMO = ['Z_1', 'Z_2', 'Z_3']

# ══════════════════════════════════════════════════════════════════════
# PART A: ADDITIONAL INSTITUTIONAL DATA
# ══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART A: Download / Construct Additional Institutional Data")
print("=" * 70)

# --- World Bank WDI downloads ---
WDI_INDICATORS = {
    'FD.AST.PRVT.GD.ZS': 'wdi_domestic_credit',
    'CM.MKT.LCAP.GD.ZS': 'wdi_stock_mktcap',
    'GFDD.DI.14': 'wdi_stock_turnover',
}

for code, label in WDI_INDICATORS.items():
    cache_file = DATA_RAW / f'{label}.json'
    if cache_file.exists():
        print(f"  {label}: cached")
        continue
    url = (f"https://api.worldbank.org/v2/country/all/indicator/{code}"
           f"?date=1970:2024&format=json&per_page=20000")
    print(f"  Downloading {label} ({code})...")
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read().decode('utf-8')
        with open(cache_file, 'w') as f:
            f.write(raw)
        print(f"    -> saved to {cache_file}")
        time.sleep(1)  # polite delay
    except Exception as e:
        print(f"    -> FAILED: {e}")

# Parse WDI downloads
for code, label in WDI_INDICATORS.items():
    cache_file = DATA_RAW / f'{label}.json'
    if not cache_file.exists():
        continue
    try:
        with open(cache_file) as f:
            data = json.load(f)
        if isinstance(data, list) and len(data) > 1:
            records = data[1]
            rows = []
            for r in records:
                if r['value'] is not None:
                    rows.append({
                        'iso3': r['countryiso3code'],
                        'year': int(r['date']),
                        label: float(r['value']),
                    })
            wdi_df = pd.DataFrame(rows)
            if len(wdi_df) > 0:
                # Merge into main panel
                before = df[label].notna().sum() if label in df.columns else 0
                df = df.merge(wdi_df, on=['iso3', 'year'], how='left',
                              suffixes=('', '_wdi'))
                # If column already existed, fill NAs from WDI
                if f'{label}_wdi' in df.columns:
                    df[label] = df[label].fillna(df[f'{label}_wdi'])
                    df.drop(columns=[f'{label}_wdi'], inplace=True)
                after = df[label].notna().sum()
                print(f"  Merged {label}: {before} -> {after} non-null obs")
    except Exception as e:
        print(f"  Error parsing {label}: {e}")

# --- Exchange rate regime proxy ---
print("\n  Constructing exchange rate regime proxies...")

# Eurozone time-varying membership
EZ_MEMBERS = {
    'AUT': 1999, 'BEL': 1999, 'FIN': 1999, 'FRA': 1999, 'DEU': 1999,
    'IRL': 1999, 'ITA': 1999, 'LUX': 1999, 'NLD': 1999, 'PRT': 1999,
    'ESP': 1999, 'GRC': 2001, 'SVN': 2007, 'CYP': 2008, 'MLT': 2008,
    'SVK': 2009, 'EST': 2011, 'LVA': 2014, 'LTU': 2015,
}
df['eurozone'] = df.apply(
    lambda r: 1 if r['iso3'] in EZ_MEMBERS and r['year'] >= EZ_MEMBERS.get(r['iso3'], 9999) else 0,
    axis=1
)
print(f"  Eurozone obs: {df['eurozone'].sum()}")

# Fixed FX proxy: eurozone members
df['fixed_fx'] = df['eurozone'].copy()

# Restricted capital proxy: kaopen < median
kaopen_median = df['kaopen'].median()
df['restricted_capital'] = (df['kaopen'] < kaopen_median).astype(int)
print(f"  Kaopen median: {kaopen_median:.3f}, restricted obs: {df['restricted_capital'].sum()}")

# Reconstruct financial_depth from WDI data
# The original panel column may be all zeros; use WDI downloads if available
credit_col = None
mktcap_col = None
for c in ['wdi_domestic_credit', 'domestic_credit_private']:
    if c in df.columns and df[c].notna().sum() > 100:
        credit_col = c
        break
for c in ['wdi_stock_mktcap', 'stock_market_cap_gdp']:
    if c in df.columns and df[c].notna().sum() > 100:
        mktcap_col = c
        break

print(f"  Credit source: {credit_col} ({df[credit_col].notna().sum() if credit_col else 0} obs)")
print(f"  Mktcap source: {mktcap_col} ({df[mktcap_col].notna().sum() if mktcap_col else 0} obs)")

if credit_col or mktcap_col:
    cr = df[credit_col] if credit_col else pd.Series(np.nan, index=df.index)
    mk = df[mktcap_col] if mktcap_col else pd.Series(np.nan, index=df.index)
    # financial_depth = sum of available components; NaN only if BOTH are NaN
    df['financial_depth'] = cr.fillna(0) + mk.fillna(0)
    # Set to NaN where both sources are NaN
    both_nan = cr.isna() & mk.isna()
    df.loc[both_nan, 'financial_depth'] = np.nan
    print(f"  Reconstructed financial_depth: {df['financial_depth'].notna().sum()} non-null, "
          f"mean={df['financial_depth'].mean():.1f}")
else:
    if 'financial_depth' not in df.columns or (df['financial_depth'] == 0).all():
        df['financial_depth'] = np.nan
        print(f"  WARNING: No source data for financial_depth")

print(f"\n  Final panel: {df.shape[0]} obs, {df['iso3'].nunique()} countries")

# ══════════════════════════════════════════════════════════════════════
# PART B: KAOPEN DEEP DIVE
# ══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART B: Capital Openness (Kaopen) Deep Dive")
print("=" * 70)

results_b = []

# B1: Z_1 x kaopen interaction
print("\n--- B1: Z_1 x kaopen interaction -> gross_assets_gdp ---")
df['Z_1_x_kaopen_int'] = df['Z_1'] * df['kaopen']
x_vars = DEMO + CONTROLS + ['kaopen', 'Z_1_x_kaopen_int']
gls, sdf = run_gls(df, 'gross_assets_gdp', x_vars, 'Z1×kaopen')
if gls:
    gls.summary(feature_names=x_vars)
    cd = coef_dict(gls, x_vars)
    results_b.append({
        'Model': 'Z_1 × kaopen interaction',
        'Sample': 'Full',
        'N': gls.n_obs, 'Countries': gls.n_countries,
        'Z_1': fmt_coef(*cd['Z_1']),
        'kaopen': fmt_coef(*cd['kaopen']),
        'Z_1×kaopen': fmt_coef(*cd['Z_1_x_kaopen_int']),
        'R²': f"{gls.r_squared:.4f}",
    })

# B2: Kaopen tercile splits
print("\n--- B2: Kaopen tercile splits -> gross_assets_gdp ---")
terciles = df.dropna(subset=['kaopen']).groupby('iso3')['kaopen'].mean()
t1, t2 = terciles.quantile(0.333), terciles.quantile(0.667)
closed_countries = terciles[terciles <= t1].index
mid_countries = terciles[(terciles > t1) & (terciles <= t2)].index
open_countries = terciles[terciles > t2].index

for label, countries in [('Closed (T1)', closed_countries),
                          ('Mid (T2)', mid_countries),
                          ('Open (T3)', open_countries)]:
    sdf = df[df['iso3'].isin(countries)]
    x_vars_t = DEMO + CONTROLS
    gls, _ = run_gls(sdf, 'gross_assets_gdp', x_vars_t, f'Tercile {label}')
    if gls:
        gls.summary(feature_names=x_vars_t)
        cd = coef_dict(gls, x_vars_t)
        results_b.append({
            'Model': f'Tercile: {label}',
            'Sample': f'{len(countries)} countries',
            'N': gls.n_obs, 'Countries': gls.n_countries,
            'Z_1': fmt_coef(*cd['Z_1']),
            'kaopen': '—',
            'Z_1×kaopen': '—',
            'R²': f"{gls.r_squared:.4f}",
        })

# B3: Open vs Closed binary split
print("\n--- B3: Open (kaopen>0) vs Closed (kaopen<=0) ---")
for label, mask in [('Open (kaopen>0)', df['kaopen'] > 0),
                     ('Closed (kaopen<=0)', df['kaopen'] <= 0)]:
    sdf = df[mask]
    x_vars_t = DEMO + CONTROLS
    gls, _ = run_gls(sdf, 'gross_assets_gdp', x_vars_t, label)
    if gls:
        gls.summary(feature_names=x_vars_t)
        cd = coef_dict(gls, x_vars_t)
        results_b.append({
            'Model': f'Binary: {label}',
            'Sample': f'{sdf["iso3"].nunique()} countries',
            'N': gls.n_obs, 'Countries': gls.n_countries,
            'Z_1': fmt_coef(*cd['Z_1']),
            'kaopen': '—',
            'Z_1×kaopen': '—',
            'R²': f"{gls.r_squared:.4f}",
        })

# Save Part B
tbl_b = pd.DataFrame(results_b)
md_b = "# Capital Openness (Kaopen) Deep Dive\n\n"
md_b += "Dependent variable: gross_assets_gdp\n\n"
md_b += tbl_b.to_markdown(index=False)
(TABLE_DIR / 'regulatory_kaopen.md').write_text(md_b)
print(f"\n  Saved -> {TABLE_DIR / 'regulatory_kaopen.md'}")

# ══════════════════════════════════════════════════════════════════════
# PART C: FINANCIAL MARKET DEPTH AS MECHANISM
# ══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART C: Financial Market Depth as Mechanism")
print("=" * 70)

results_c = []

# C1: Z -> financial_depth
print("\n--- C1: Z -> financial_depth ---")
x_vars = DEMO + CONTROLS
gls_c1, _ = run_gls(df, 'financial_depth', x_vars, 'Z→financial_depth')
if gls_c1:
    gls_c1.summary(feature_names=x_vars)
    cd = coef_dict(gls_c1, x_vars)
    results_c.append({
        'Model': 'Z → financial_depth',
        'DV': 'financial_depth',
        'Z_1_coef': cd['Z_1'][0],
        'Z_1_se': cd['Z_1'][1],
        'Z_1_p': cd['Z_1'][2],
        'N': gls_c1.n_obs,
        'R²': gls_c1.r_squared,
    })

# C2: financial_depth -> gross_assets_gdp
print("\n--- C2: financial_depth -> gross_assets_gdp ---")
x_vars = ['financial_depth'] + CONTROLS
gls_c2, _ = run_gls(df, 'gross_assets_gdp', x_vars, 'findepth→assets')
if gls_c2:
    gls_c2.summary(feature_names=x_vars)
    cd = coef_dict(gls_c2, x_vars)
    results_c.append({
        'Model': 'financial_depth → gross_assets',
        'DV': 'gross_assets_gdp',
        'Z_1_coef': np.nan,
        'Z_1_se': np.nan,
        'Z_1_p': np.nan,
        'N': gls_c2.n_obs,
        'R²': gls_c2.r_squared,
    })

# C3: Z -> gross_assets WITHOUT financial_depth
print("\n--- C3: Z -> gross_assets (no financial_depth) ---")
x_vars_no = DEMO + CONTROLS
gls_c3, _ = run_gls(df, 'gross_assets_gdp', x_vars_no, 'Z→assets (no findepth)')
z1_no_findepth = None
if gls_c3:
    gls_c3.summary(feature_names=x_vars_no)
    cd = coef_dict(gls_c3, x_vars_no)
    z1_no_findepth = cd['Z_1'][0]
    results_c.append({
        'Model': 'Z → gross_assets (no fin_depth)',
        'DV': 'gross_assets_gdp',
        'Z_1_coef': cd['Z_1'][0],
        'Z_1_se': cd['Z_1'][1],
        'Z_1_p': cd['Z_1'][2],
        'N': gls_c3.n_obs,
        'R²': gls_c3.r_squared,
    })

# C4: Z -> gross_assets WITH financial_depth
print("\n--- C4: Z -> gross_assets (with financial_depth) ---")
x_vars_fd = DEMO + CONTROLS + ['financial_depth']
gls_c4, _ = run_gls(df, 'gross_assets_gdp', x_vars_fd, 'Z→assets (+findepth)')
z1_with_findepth = None
if gls_c4:
    gls_c4.summary(feature_names=x_vars_fd)
    cd = coef_dict(gls_c4, x_vars_fd)
    z1_with_findepth = cd['Z_1'][0]
    results_c.append({
        'Model': 'Z → gross_assets (+ fin_depth)',
        'DV': 'gross_assets_gdp',
        'Z_1_coef': cd['Z_1'][0],
        'Z_1_se': cd['Z_1'][1],
        'Z_1_p': cd['Z_1'][2],
        'N': gls_c4.n_obs,
        'R²': gls_c4.r_squared,
    })

# C5: Z -> gross_assets with BOTH financial_depth AND pension_spending
print("\n--- C5: Z -> gross_assets (financial_depth + pension_spending) ---")
x_vars_both = DEMO + CONTROLS + ['financial_depth', 'pension_spending_gdp']
gls_c5, _ = run_gls(df, 'gross_assets_gdp', x_vars_both, 'Z→assets (+findepth+pension)')
z1_with_both = None
if gls_c5:
    gls_c5.summary(feature_names=x_vars_both)
    cd = coef_dict(gls_c5, x_vars_both)
    z1_with_both = cd['Z_1'][0]
    results_c.append({
        'Model': 'Z → gross_assets (+ fin_depth + pension)',
        'DV': 'gross_assets_gdp',
        'Z_1_coef': cd['Z_1'][0],
        'Z_1_se': cd['Z_1'][1],
        'Z_1_p': cd['Z_1'][2],
        'N': gls_c5.n_obs,
        'R²': gls_c5.r_squared,
    })

# Calculate mediation percentages
# Need to use common sample for valid comparison
print("\n--- Mediation % (common sample) ---")
common_vars = DEMO + CONTROLS + ['financial_depth', 'pension_spending_gdp']
common_df = df.dropna(subset=['gross_assets_gdp'] + common_vars)
print(f"  Common sample: {len(common_df)} obs, {common_df['iso3'].nunique()} countries")

# Baseline on common sample
gls_base, _ = run_gls(common_df, 'gross_assets_gdp', DEMO + CONTROLS, 'Base (common)')
z1_base_common = coef_dict(gls_base, DEMO + CONTROLS)['Z_1'][0] if gls_base else None

# + financial_depth only
gls_fd, _ = run_gls(common_df, 'gross_assets_gdp', DEMO + CONTROLS + ['financial_depth'], '+findepth (common)')
z1_fd_common = coef_dict(gls_fd, DEMO + CONTROLS + ['financial_depth'])['Z_1'][0] if gls_fd else None

# + pension only
gls_pen, _ = run_gls(common_df, 'gross_assets_gdp', DEMO + CONTROLS + ['pension_spending_gdp'], '+pension (common)')
z1_pen_common = coef_dict(gls_pen, DEMO + CONTROLS + ['pension_spending_gdp'])['Z_1'][0] if gls_pen else None

# + both
gls_both, _ = run_gls(common_df, 'gross_assets_gdp', DEMO + CONTROLS + ['financial_depth', 'pension_spending_gdp'], '+both (common)')
z1_both_common = coef_dict(gls_both, DEMO + CONTROLS + ['financial_depth', 'pension_spending_gdp'])['Z_1'][0] if gls_both else None

mediation_rows = []
if z1_base_common and z1_base_common != 0:
    if z1_fd_common is not None:
        pct_fd = 100 * (1 - z1_fd_common / z1_base_common)
        mediation_rows.append({'Channel': 'Financial depth', '% mediated': f"{pct_fd:.1f}%"})
        print(f"  Financial depth mediates: {pct_fd:.1f}% of Z_1 effect")
    if z1_pen_common is not None:
        pct_pen = 100 * (1 - z1_pen_common / z1_base_common)
        mediation_rows.append({'Channel': 'Pension spending', '% mediated': f"{pct_pen:.1f}%"})
        print(f"  Pension spending mediates: {pct_pen:.1f}% of Z_1 effect")
    if z1_both_common is not None:
        pct_both = 100 * (1 - z1_both_common / z1_base_common)
        mediation_rows.append({'Channel': 'Both combined', '% mediated': f"{pct_both:.1f}%"})
        print(f"  Both combined mediate: {pct_both:.1f}% of Z_1 effect")

# Save Part C
tbl_c = pd.DataFrame(results_c)
md_c = "# Financial Market Depth as Mechanism\n\n"
md_c += "## Regression Results\n\n"
md_c += tbl_c.to_markdown(index=False)
md_c += "\n\n## Mediation Analysis (Common Sample)\n\n"
if mediation_rows:
    md_c += pd.DataFrame(mediation_rows).to_markdown(index=False)
else:
    md_c += "Insufficient data for mediation analysis.\n"
(TABLE_DIR / 'regulatory_financial_depth.md').write_text(md_c)
print(f"\n  Saved -> {TABLE_DIR / 'regulatory_financial_depth.md'}")

# ══════════════════════════════════════════════════════════════════════
# PART D: EXCHANGE RATE REGIME
# ══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART D: Exchange Rate Regime")
print("=" * 70)

results_d = []

# D1: Eurozone vs non-eurozone
print("\n--- D1: Z -> gross_assets for Eurozone vs non-Eurozone ---")
for label, mask in [('Eurozone', df['eurozone'] == 1),
                     ('Non-Eurozone', df['eurozone'] == 0)]:
    sdf = df[mask]
    x_vars = DEMO + CONTROLS
    gls, _ = run_gls(sdf, 'gross_assets_gdp', x_vars, label)
    if gls:
        gls.summary(feature_names=x_vars)
        cd = coef_dict(gls, x_vars)
        results_d.append({
            'Model': f'Subsample: {label}',
            'N': gls.n_obs, 'Countries': gls.n_countries,
            'Z_1': fmt_coef(*cd['Z_1']),
            'Z_2': fmt_coef(*cd['Z_2']),
            'Z_3': fmt_coef(*cd['Z_3']),
            'R²': f"{gls.r_squared:.4f}",
        })

# D2: Fixed vs Flexible FX (eurozone as proxy for fixed)
print("\n--- D2: Z -> gross_assets for Fixed vs Flexible FX ---")
# Extend fixed_fx: also treat countries with very high kaopen stability as potentially pegged
# But keep it simple: eurozone = fixed
for label, mask in [('Fixed FX (EZ)', df['fixed_fx'] == 1),
                     ('Flexible FX', df['fixed_fx'] == 0)]:
    sdf = df[mask]
    x_vars = DEMO + CONTROLS
    gls, _ = run_gls(sdf, 'gross_assets_gdp', x_vars, label)
    if gls:
        gls.summary(feature_names=x_vars)
        cd = coef_dict(gls, x_vars)
        results_d.append({
            'Model': f'FX regime: {label}',
            'N': gls.n_obs, 'Countries': gls.n_countries,
            'Z_1': fmt_coef(*cd['Z_1']),
            'Z_2': fmt_coef(*cd['Z_2']),
            'Z_3': fmt_coef(*cd['Z_3']),
            'R²': f"{gls.r_squared:.4f}",
        })

# D3: Z_1 x fixed_fx interaction
print("\n--- D3: Z_1 x fixed_fx interaction ---")
df['Z_1_x_fixed_fx'] = df['Z_1'] * df['fixed_fx']
x_vars = DEMO + CONTROLS + ['fixed_fx', 'Z_1_x_fixed_fx']
gls, _ = run_gls(df, 'gross_assets_gdp', x_vars, 'Z1×fixed_fx')
if gls:
    gls.summary(feature_names=x_vars)
    cd = coef_dict(gls, x_vars)
    results_d.append({
        'Model': 'Z_1 × fixed_fx interaction',
        'N': gls.n_obs, 'Countries': gls.n_countries,
        'Z_1': fmt_coef(*cd['Z_1']),
        'Z_2': fmt_coef(*cd['Z_2']),
        'Z_3': fmt_coef(*cd['Z_3']),
        'R²': f"{gls.r_squared:.4f}",
    })
    print(f"  fixed_fx level: {fmt_coef(*cd['fixed_fx'])}")
    print(f"  Z_1×fixed_fx:   {fmt_coef(*cd['Z_1_x_fixed_fx'])}")

# Save Part D
tbl_d = pd.DataFrame(results_d)
md_d = "# Exchange Rate Regime Analysis\n\n"
md_d += "Dependent variable: gross_assets_gdp\n\n"
md_d += "Fixed FX proxy = Eurozone membership (time-varying)\n\n"
md_d += tbl_d.to_markdown(index=False)
(TABLE_DIR / 'regulatory_fx_regime.md').write_text(md_d)
print(f"\n  Saved -> {TABLE_DIR / 'regulatory_fx_regime.md'}")

# ══════════════════════════════════════════════════════════════════════
# PART E: COMPOSITE INSTITUTIONAL INDEX
# ══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART E: Composite Institutional Openness Index")
print("=" * 70)

results_e = []

# Normalize components to [0, 1]
def normalize(s):
    mn, mx = s.min(), s.max()
    if mx == mn:
        return s * 0
    return (s - mn) / (mx - mn)

df['kaopen_norm'] = normalize(df['kaopen'])
df['findepth_norm'] = normalize(df['financial_depth'])
df['ez_norm'] = df['eurozone'].astype(float)  # already 0/1

# Composite index = mean of available normalized components
df['inst_index'] = df[['kaopen_norm', 'findepth_norm', 'ez_norm']].mean(axis=1)
print(f"  Institutional index: mean={df['inst_index'].mean():.3f}, "
      f"std={df['inst_index'].std():.3f}, non-null={df['inst_index'].notna().sum()}")

# E1: Z_1 x inst_index
print("\n--- E1: Z_1 x institutional_index -> gross_assets ---")
df['Z_1_x_inst'] = df['Z_1'] * df['inst_index']
x_vars = DEMO + CONTROLS + ['inst_index', 'Z_1_x_inst']
gls, _ = run_gls(df, 'gross_assets_gdp', x_vars, 'Z1×inst_index')
if gls:
    gls.summary(feature_names=x_vars)
    cd = coef_dict(gls, x_vars)
    results_e.append({
        'Model': 'Z_1 × inst_index',
        'Sample': 'Full',
        'N': gls.n_obs, 'Countries': gls.n_countries,
        'Z_1': fmt_coef(*cd['Z_1']),
        'inst_index': fmt_coef(*cd['inst_index']),
        'Z_1×inst': fmt_coef(*cd['Z_1_x_inst']),
        'R²': f"{gls.r_squared:.4f}",
    })

# E2: Tercile splits by institutional index
print("\n--- E2: Institutional index tercile splits ---")
inst_by_country = df.dropna(subset=['inst_index']).groupby('iso3')['inst_index'].mean()
t1_inst = inst_by_country.quantile(0.333)
t2_inst = inst_by_country.quantile(0.667)
low_inst = inst_by_country[inst_by_country <= t1_inst].index
mid_inst = inst_by_country[(inst_by_country > t1_inst) & (inst_by_country <= t2_inst)].index
high_inst = inst_by_country[inst_by_country > t2_inst].index

for label, countries in [('Low institutional openness', low_inst),
                          ('Mid institutional openness', mid_inst),
                          ('High institutional openness', high_inst)]:
    sdf = df[df['iso3'].isin(countries)]
    x_vars = DEMO + CONTROLS
    gls, _ = run_gls(sdf, 'gross_assets_gdp', x_vars, label)
    if gls:
        gls.summary(feature_names=x_vars)
        cd = coef_dict(gls, x_vars)
        results_e.append({
            'Model': f'Tercile: {label}',
            'Sample': f'{len(countries)} countries',
            'N': gls.n_obs, 'Countries': gls.n_countries,
            'Z_1': fmt_coef(*cd['Z_1']),
            'inst_index': '—',
            'Z_1×inst': '—',
            'R²': f"{gls.r_squared:.4f}",
        })

# Save Part E
tbl_e = pd.DataFrame(results_e)
md_e = "# Composite Institutional Openness Index\n\n"
md_e += "Index = mean(normalize(kaopen), normalize(financial_depth), eurozone_dummy)\n\n"
md_e += "Dependent variable: gross_assets_gdp\n\n"
md_e += tbl_e.to_markdown(index=False)
(TABLE_DIR / 'regulatory_composite.md').write_text(md_e)
print(f"\n  Saved -> {TABLE_DIR / 'regulatory_composite.md'}")

# ══════════════════════════════════════════════════════════════════════
# PART F: FULL MEDIATION ACCOUNTING
# ══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART F: Full Mediation Accounting")
print("=" * 70)

# Build common sample across ALL mediators
mediation_vars = DEMO + CONTROLS + ['pension_spending_gdp', 'financial_depth', 'eurozone']
med_df = df.dropna(subset=['gross_assets_gdp'] + mediation_vars).copy()
print(f"  Common mediation sample: {len(med_df)} obs, {med_df['iso3'].nunique()} countries")

med_results = []

# Model 0: Baseline
print("\n--- Model 0: Z only (baseline) ---")
x0 = DEMO + CONTROLS
gls0, _ = run_gls(med_df, 'gross_assets_gdp', x0, 'M0: baseline')
if gls0:
    gls0.summary(feature_names=x0)
    z1_base = coef_dict(gls0, x0)['Z_1'][0]
    med_results.append({
        'Model': 'M0: Z + controls',
        'Added': '—',
        'Z_1_coef': z1_base,
        'Z_1_se': coef_dict(gls0, x0)['Z_1'][1],
        'Z_1_p': coef_dict(gls0, x0)['Z_1'][2],
        '% attenuation': '—',
        'N': gls0.n_obs,
        'R²': gls0.r_squared,
    })

# Model 1: + pension_spending
print("\n--- Model 1: + pension_spending ---")
x1 = DEMO + CONTROLS + ['pension_spending_gdp']
gls1, _ = run_gls(med_df, 'gross_assets_gdp', x1, 'M1: +pension')
if gls1 and gls0:
    gls1.summary(feature_names=x1)
    z1_m1 = coef_dict(gls1, x1)['Z_1'][0]
    att1 = 100 * (1 - z1_m1 / z1_base) if z1_base != 0 else np.nan
    med_results.append({
        'Model': 'M1: + pension_spending',
        'Added': 'pension_spending_gdp',
        'Z_1_coef': z1_m1,
        'Z_1_se': coef_dict(gls1, x1)['Z_1'][1],
        'Z_1_p': coef_dict(gls1, x1)['Z_1'][2],
        '% attenuation': f"{att1:.1f}%",
        'N': gls1.n_obs,
        'R²': gls1.r_squared,
    })

# Model 2: + financial_depth
print("\n--- Model 2: + financial_depth ---")
x2 = DEMO + CONTROLS + ['financial_depth']
gls2, _ = run_gls(med_df, 'gross_assets_gdp', x2, 'M2: +findepth')
if gls2 and gls0:
    gls2.summary(feature_names=x2)
    z1_m2 = coef_dict(gls2, x2)['Z_1'][0]
    att2 = 100 * (1 - z1_m2 / z1_base) if z1_base != 0 else np.nan
    med_results.append({
        'Model': 'M2: + financial_depth',
        'Added': 'financial_depth',
        'Z_1_coef': z1_m2,
        'Z_1_se': coef_dict(gls2, x2)['Z_1'][1],
        'Z_1_p': coef_dict(gls2, x2)['Z_1'][2],
        '% attenuation': f"{att2:.1f}%",
        'N': gls2.n_obs,
        'R²': gls2.r_squared,
    })

# Model 3: + kaopen (already in controls, so add kaopen interaction)
# Actually kaopen is NOT in CONTROLS list — it's separate. Let's add it explicitly.
print("\n--- Model 3: + kaopen (explicit) ---")
x3 = DEMO + CONTROLS + ['kaopen']
gls3, _ = run_gls(med_df, 'gross_assets_gdp', x3, 'M3: +kaopen')
if gls3 and gls0:
    gls3.summary(feature_names=x3)
    z1_m3 = coef_dict(gls3, x3)['Z_1'][0]
    att3 = 100 * (1 - z1_m3 / z1_base) if z1_base != 0 else np.nan
    med_results.append({
        'Model': 'M3: + kaopen',
        'Added': 'kaopen',
        'Z_1_coef': z1_m3,
        'Z_1_se': coef_dict(gls3, x3)['Z_1'][1],
        'Z_1_p': coef_dict(gls3, x3)['Z_1'][2],
        '% attenuation': f"{att3:.1f}%",
        'N': gls3.n_obs,
        'R²': gls3.r_squared,
    })

# Model 4: + pension + financial_depth
print("\n--- Model 4: + pension + financial_depth ---")
x4 = DEMO + CONTROLS + ['pension_spending_gdp', 'financial_depth']
gls4, _ = run_gls(med_df, 'gross_assets_gdp', x4, 'M4: +pension+findepth')
if gls4 and gls0:
    gls4.summary(feature_names=x4)
    z1_m4 = coef_dict(gls4, x4)['Z_1'][0]
    att4 = 100 * (1 - z1_m4 / z1_base) if z1_base != 0 else np.nan
    med_results.append({
        'Model': 'M4: + pension + fin_depth',
        'Added': 'pension + financial_depth',
        'Z_1_coef': z1_m4,
        'Z_1_se': coef_dict(gls4, x4)['Z_1'][1],
        'Z_1_p': coef_dict(gls4, x4)['Z_1'][2],
        '% attenuation': f"{att4:.1f}%",
        'N': gls4.n_obs,
        'R²': gls4.r_squared,
    })

# Model 5: + pension + financial_depth + eurozone
print("\n--- Model 5: + pension + financial_depth + eurozone ---")
x5 = DEMO + CONTROLS + ['pension_spending_gdp', 'financial_depth', 'eurozone']
gls5, _ = run_gls(med_df, 'gross_assets_gdp', x5, 'M5: +all')
if gls5 and gls0:
    gls5.summary(feature_names=x5)
    z1_m5 = coef_dict(gls5, x5)['Z_1'][0]
    att5 = 100 * (1 - z1_m5 / z1_base) if z1_base != 0 else np.nan
    med_results.append({
        'Model': 'M5: + pension + fin_depth + EZ',
        'Added': 'pension + fin_depth + eurozone',
        'Z_1_coef': z1_m5,
        'Z_1_se': coef_dict(gls5, x5)['Z_1'][1],
        'Z_1_p': coef_dict(gls5, x5)['Z_1'][2],
        '% attenuation': f"{att5:.1f}%",
        'N': gls5.n_obs,
        'R²': gls5.r_squared,
    })

# Save Part F
tbl_f = pd.DataFrame(med_results)
md_f = "# Full Mediation Accounting\n\n"
md_f += "Sequential addition of mediators to Z → gross_assets_gdp\n\n"
md_f += "Common sample throughout. Controls: rgdp_growth, fiscal_bal_gdp, nfa_gdp_lag\n\n"
md_f += tbl_f.to_markdown(index=False)
(TABLE_DIR / 'regulatory_mediation_accounting.md').write_text(md_f)
print(f"\n  Saved -> {TABLE_DIR / 'regulatory_mediation_accounting.md'}")

# ══════════════════════════════════════════════════════════════════════
# PART G: RESIDUAL ANALYSIS
# ══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART G: Residual Analysis — What's Left?")
print("=" * 70)

results_g = []

# Full model with all channels
print("\n--- Full model: Z + controls + pension + findepth + kaopen + eurozone ---")
x_full = DEMO + CONTROLS + ['pension_spending_gdp', 'financial_depth', 'kaopen', 'eurozone']
gls_full, sdf_full = run_gls(med_df, 'gross_assets_gdp', x_full, 'Full model')
if gls_full and gls0:
    gls_full.summary(feature_names=x_full)
    cd_full = coef_dict(gls_full, x_full)
    z1_full = cd_full['Z_1'][0]
    pct_explained = 100 * (1 - z1_full / z1_base) if z1_base != 0 else np.nan
    pct_residual = 100 - pct_explained if not np.isnan(pct_explained) else np.nan

    print(f"\n  Baseline Z_1 coefficient:  {z1_base:.4f}")
    print(f"  Full model Z_1 coefficient: {z1_full:.4f}")
    print(f"  % of Z_1 effect explained:  {pct_explained:.1f}%")
    print(f"  % residual / unexplained:   {pct_residual:.1f}%")

    results_g.append({
        'Analysis': 'Full sample',
        'Z_1_baseline': f"{z1_base:.4f}",
        'Z_1_full_model': f"{z1_full:.4f}",
        '% explained': f"{pct_explained:.1f}%",
        '% residual': f"{pct_residual:.1f}%",
        'N': gls_full.n_obs,
    })

# Residual by income group
print("\n--- Residual Z_1 by income group ---")
if 'income_group' in med_df.columns:
    for ig in sorted(med_df['income_group'].dropna().unique()):
        sdf = med_df[med_df['income_group'] == ig]
        if len(sdf) < 50:
            continue
        # Baseline for this group
        gls_ig_base, _ = run_gls(sdf, 'gross_assets_gdp', DEMO + CONTROLS, f'{ig} base')
        gls_ig_full, _ = run_gls(sdf, 'gross_assets_gdp', x_full, f'{ig} full')
        if gls_ig_base and gls_ig_full:
            z1_ig_base = coef_dict(gls_ig_base, DEMO + CONTROLS)['Z_1'][0]
            z1_ig_full = coef_dict(gls_ig_full, x_full)['Z_1'][0]
            pct = 100 * (1 - z1_ig_full / z1_ig_base) if z1_ig_base != 0 else np.nan
            results_g.append({
                'Analysis': f'Income: {ig}',
                'Z_1_baseline': f"{z1_ig_base:.4f}",
                'Z_1_full_model': f"{z1_ig_full:.4f}",
                '% explained': f"{pct:.1f}%" if not np.isnan(pct) else "N/A",
                '% residual': f"{100-pct:.1f}%" if not np.isnan(pct) else "N/A",
                'N': gls_ig_full.n_obs,
            })
            print(f"  {ig}: baseline Z_1={z1_ig_base:.4f}, full Z_1={z1_ig_full:.4f}, explained={pct:.1f}%")

# Save Part G
tbl_g = pd.DataFrame(results_g)
md_g = "# Residual Analysis\n\n"
md_g += "After controlling for pension spending, financial depth, kaopen, and eurozone membership:\n\n"
md_g += tbl_g.to_markdown(index=False)
md_g += "\n\n**Interpretation**: The residual Z_1 coefficient reflects unmeasured regulatory "
md_g += "constraints (pension investment mandates, local currency rules, home bias regulations, etc.)\n"
(TABLE_DIR / 'regulatory_residual.md').write_text(md_g)
print(f"\n  Saved -> {TABLE_DIR / 'regulatory_residual.md'}")

# ══════════════════════════════════════════════════════════════════════
# PART H: SUMMARY DECOMPOSITION
# ══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART H: Summary Decomposition")
print("=" * 70)

summary_rows = []

if gls0 and gls1 and gls2 and gls5:
    # All on common sample
    z1_m0 = z1_base
    z1_m1_val = coef_dict(gls1, DEMO + CONTROLS + ['pension_spending_gdp'])['Z_1'][0]
    z1_m2_val = coef_dict(gls2, DEMO + CONTROLS + ['financial_depth'])['Z_1'][0]

    # For incremental EZ: compare M4 vs M5
    z1_m4_val = coef_dict(gls4, DEMO + CONTROLS + ['pension_spending_gdp', 'financial_depth'])['Z_1'][0] if gls4 else z1_m0
    z1_m5_val = coef_dict(gls5, DEMO + CONTROLS + ['pension_spending_gdp', 'financial_depth', 'eurozone'])['Z_1'][0] if gls5 else z1_m4_val

    # For incremental kaopen: compare M5 vs full (M5 + kaopen)
    z1_full_val = coef_dict(gls_full, x_full)['Z_1'][0] if gls_full else z1_m5_val

    # Incremental contributions
    pct_pension = 100 * (1 - z1_m1_val / z1_m0) if z1_m0 != 0 else 0
    pct_findepth = 100 * (z1_m1_val - z1_m4_val) / z1_m0 if z1_m0 != 0 else 0  # incremental after pension
    pct_ez = 100 * (z1_m4_val - z1_m5_val) / z1_m0 if z1_m0 != 0 else 0  # incremental after pension+findepth
    pct_kaopen = 100 * (z1_m5_val - z1_full_val) / z1_m0 if z1_m0 != 0 else 0  # incremental after all
    pct_residual = 100 * z1_full_val / z1_m0 if z1_m0 != 0 else 100

    summary_rows = [
        {'Component': 'Total Z_1 effect', 'Z_1 coefficient': f"{z1_m0:.4f}", '% of total': '100.0%'},
        {'Component': 'Pension spending', 'Z_1 coefficient': f"Δ = {z1_m0 - z1_m1_val:.4f}", '% of total': f"{pct_pension:.1f}%"},
        {'Component': 'Financial depth (incremental)', 'Z_1 coefficient': f"Δ = {z1_m1_val - z1_m4_val:.4f}", '% of total': f"{pct_findepth:.1f}%"},
        {'Component': 'Exchange rate regime (incremental)', 'Z_1 coefficient': f"Δ = {z1_m4_val - z1_m5_val:.4f}", '% of total': f"{pct_ez:.1f}%"},
        {'Component': 'Capital openness (incremental)', 'Z_1 coefficient': f"Δ = {z1_m5_val - z1_full_val:.4f}", '% of total': f"{pct_kaopen:.1f}%"},
        {'Component': 'Residual / unexplained', 'Z_1 coefficient': f"{z1_full_val:.4f}", '% of total': f"{pct_residual:.1f}%"},
    ]

    print("\n  DECOMPOSITION OF Z_1 EFFECT ON GROSS EXTERNAL ASSETS")
    print("  " + "-" * 60)
    for r in summary_rows:
        print(f"  {r['Component']:<40} {r['Z_1 coefficient']:>15}  {r['% of total']:>10}")
    print("  " + "-" * 60)

# Save Part H
tbl_h = pd.DataFrame(summary_rows)
md_h = "# Summary Decomposition: Z_1 Effect on Gross External Assets\n\n"
md_h += "Sequential decomposition on common sample (countries with data on all mediators)\n\n"
md_h += f"Controls: {', '.join(CONTROLS)}\n\n"
md_h += tbl_h.to_markdown(index=False)
(TABLE_DIR / 'regulatory_summary.md').write_text(md_h)
print(f"\n  Saved -> {TABLE_DIR / 'regulatory_summary.md'}")

print("\n" + "=" * 70)
print("PHASE 4 COMPLETE")
print("=" * 70)
